import urllib.request as ur
import urllib.parse as up
import re

# Uncomment the three lines below to read the parameters interactively:
# key_word = input('Enter the search keyword: ')
# page_numb_start = int(input('Enter the first page to crawl: '))
# page_numb_end = int(input('Enter the last page to crawl: '))
key_word = 'java'
page_numb_start = 1
page_numb_end = 1

def geturlencode(page_numb, key_word):
    """Build the CSDN search URL for the given page number and keyword.

    The query parameters are URL-encoded, so keywords containing
    special characters (spaces, '+', non-ASCII) are handled safely.
    """
    query = up.urlencode({'p': page_numb, 'q': key_word})
    return 'https://so.csdn.net/so/search/s.do?' + query

def gethtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Sends a browser-like User-Agent plus a fixed Cookie header so the
    request is not rejected as an obvious bot.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Cookie': 'acw_tc=2760820015767202930997162e9e84db4bc3ba7a48e31dc522436d30285afc; acw_sc__v2=5dfad7cc8b2d0d26b4965e1f61c89c8633d4efa5; uuid_tt_dd=10_6654531260-1576720373837-502623; dc_session_id=10_1576720373837.288858; dc_tos=q2ql8a; c-login-auto=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1576720384; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1576720384; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_6654531260-1576720373837-502623; announcement=%257B%2522isLogin%2522%253Afalse%252C%2522announcementUrl%2522%253A%2522https%253A%252F%252Fblog.csdn.net%252Fblogdevteam%252Farticle%252Fdetails%252F103603408%2522%252C%2522announcementCount%2522%253A1%252C%2522announcementExpire%2522%253A211502583%257D; firstDie=1',
    }
    request = ur.Request(url=url, headers=headers)
    response = ur.urlopen(request)
    return response.read().decode('utf-8')

# Crawl each selected result page, follow every article link found on it,
# and save each article's HTML to a file named after the article title.
# Patterns are compiled once, outside the loops.
link_pattern = re.compile('<span class="link"><a href="(.*?)" target="_blank">')
title_pattern = re.compile('articleTitle = "(.*?)";')
for page in range(min(page_numb_start, page_numb_end), max(page_numb_start, page_numb_end) + 1):
    html = gethtml(geturlencode(page, key_word))
    for link in link_pattern.findall(html):
        article_html = gethtml(link)
        titles = title_pattern.findall(article_html)
        if not titles:
            # Not an article page (or the layout changed): the original code
            # crashed here with IndexError — skip the link instead.
            continue
        # Titles may contain characters that are illegal in file names
        # (/, \, :, *, ?, ", <, >, |) — replace them before using as a path.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', titles[0])
        name = safe_title + '.html'
        print(name)
        with open(name, 'wb') as f:
            f.write(article_html.encode('utf-8'))

